'''
Obtain the characteristics of the data used, including the TTM phase, the distribution of gender and age.
Author: Meng Zhang
Date: January 2024

Input: Data/prolific_profile_anonym.csv
Output: print out the obtained statistics
'''

import pandas as pd
import numpy as np

# Location of the anonymised Prolific export analysed by this script.
DATA_PATH = "Data/prolific_profile_anonym.csv"

# Rows carrying this completion code are counted as eligible. According to the
# original author's note these are the participants contemplating or preparing
# to quit smoking; the code itself only compares the "Completion code" column.
ELIGIBLE_COMPLETION_CODE = "CJ8QWQ2B"

# Exact labels used in the "Gender" column of the export.
FEMALE_LABEL = "Woman (including Trans Female/Trans Woman)"
MALE_LABEL = "Man (including Trans Male/Trans Man)"

# Inclusive upper age bounds of the "young" and "mid-aged" groups.
YOUNG_MAX_AGE = 25
MID_MAX_AGE = 50


def _proportion(count, total):
    """Return count / total rounded to 3 decimals, or NaN when total is 0.

    The NaN fallback keeps the script from raising ZeroDivisionError when no
    participant is eligible.
    """
    if total == 0:
        return float("nan")
    return np.round(count / total, 3)


def compute_statistics(df):
    """Compute the participant statistics reported by this script.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Completion code", "Gender" and "Age".

    Returns
    -------
    dict
        Keys: "num_people", "num_qualified", "num_female", "prop_female",
        "num_male", "prop_male", "prop_young", "prop_mid", "prop_old".
        Counts are ints. Proportions are relative to the number of eligible
        participants, rounded to 3 decimals, and NaN when nobody is eligible.
    """
    eligible = df[df["Completion code"] == ELIGIBLE_COMPLETION_CODE]
    num_qualified = len(eligible)

    gender = eligible["Gender"]
    num_female = int((gender == FEMALE_LABEL).sum())
    num_male = int((gender == MALE_LABEL).sum())

    # Rows with a missing age compare False everywhere and so fall into no
    # age group; the three proportions then sum to less than 1.
    age = eligible["Age"]
    num_young = int((age <= YOUNG_MAX_AGE).sum())
    num_mid = int(((age > YOUNG_MAX_AGE) & (age <= MID_MAX_AGE)).sum())
    num_old = int((age > MID_MAX_AGE).sum())

    return {
        "num_people": len(df),
        "num_qualified": num_qualified,
        "num_female": num_female,
        "prop_female": _proportion(num_female, num_qualified),
        "num_male": num_male,
        "prop_male": _proportion(num_male, num_qualified),
        "prop_young": _proportion(num_young, num_qualified),
        "prop_mid": _proportion(num_mid, num_qualified),
        "prop_old": _proportion(num_old, num_qualified),
    }


def main():
    """Load the Prolific profile file and print the statistics."""
    stats = compute_statistics(pd.read_csv(DATA_PATH))

    # overall and eligible participant counts
    print("The total number of people is ", stats["num_people"])
    print("The total number of eligible people is ", stats["num_qualified"])

    # gender distribution among eligible participants
    print("The total number of eligible female is ", stats["num_female"])
    print("The proportion of eligible female is ", stats["prop_female"])
    print("The total number of eligible male is ", stats["num_male"])
    print("The proportion of eligible male is ", stats["prop_male"])

    # age distribution among eligible participants
    print("The proportion of young adults is: ", stats["prop_young"])
    print("The proportion of mid-aged adults is: ", stats["prop_mid"])
    print("The proportion of old people is: ", stats["prop_old"])


if __name__ == "__main__":
    main()
